# -*- coding: utf-8 -*-

r'''A DOT language parser using funcparserlib.

The parser is based on [the DOT grammar][1]. It is pretty complete with a few
not supported things:

* String ecapes `\"`
* Ports and compass points
* XML identifiers

At the moment, the parser builds only a parse tree, not an abstract syntax tree
(AST) or an API for dealing with DOT.

  [1]: http://www.graphviz.org/doc/info/lang.html
'''

import sys, os
from pprint import pformat
from funcparserlib.lexer import make_tokenizer, Spec
from funcparserlib.parser import (maybe, many, eof, skip, oneplus, fwd,
        name_parser_vars, memoize, SyntaxError)
from funcparserlib.util import pretty_tree
from funcparserlib.contrib.common import unarg, flatten, n, op, op_, sometoks
from funcparserlib.contrib.lexer import (make_comment, make_multiline_comment,
        newline, space)

try:
    from collections import namedtuple
except ImportError:
    # Basic implementation of namedtuple for 2.1 < Python < 2.6
    def namedtuple(name, fields):
        'Only space-delimited fields are supported.'
        def prop(i, name):
            return (name, property(lambda self: self[i]))
        methods = dict(prop(i, f) for i, f in enumerate(fields.split(' ')))
        methods.update({
            '__new__': lambda cls, *args: tuple.__new__(cls, args),
            '__repr__': lambda self: '%s(%s)' % (
                name,
                ', '.join('%s=%r' % (
                    f, getattr(self, f)) for f in fields.split(' ')))})
        return type(name, (tuple,), methods)

ENCODING = 'utf-8'

Graph = namedtuple('Graph', 'strict type id stmts')
SubGraph = namedtuple('SubGraph', 'id stmts')
Node = namedtuple('Node', 'id attrs')
Attr = namedtuple('Attr', 'name value')
Edge = namedtuple('Edge', 'nodes attrs')
DefAttrs = namedtuple('DefAttrs', 'object attrs')

def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        make_multiline_comment(r'/\*', r'\*/'),
        make_comment(r'//'),
        newline,
        space,
        Spec('name',    r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*'),
        Spec('op',      r'[{};,=\[\]]|(->)|(--)'),
        Spec('number',  r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)'),
        Spec('string',  r'"[^"]*"'), # '\"' escapes are ignored
    ]
    useless = ['comment', 'newline', 'space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]

id = sometoks(['name', 'number', 'string'])
make_graph_attr = lambda args: DefAttrs(u'graph', [Attr(*args)])
make_edge = lambda x, xs, attrs: Edge([x] + xs, attrs)

node_id = memoize(id) # + maybe(port)
a_list = (
    id +
    maybe(op_('=') + id) +
    skip(maybe(op(',')))
    >> unarg(Attr))
attr_list = (
    many(op_('[') + many(a_list) + op_(']'))
    >> flatten)
attr_stmt = (
   (n('graph') | n('node') | n('edge')) +
   attr_list
   >> unarg(DefAttrs))
graph_attr = id + op_('=') + id >> make_graph_attr
node_stmt = node_id + attr_list >> unarg(Node)
# We use a fwd() because of circular definitions like (stmt_list -> stmt ->
# subgraph -> stmt_list)
subgraph = fwd()
edge_rhs = skip(op('->') | op('--')) + (subgraph | node_id)
edge_stmt = (
    (subgraph | node_id) +
    oneplus(edge_rhs) +
    attr_list
    >> unarg(make_edge))
stmt = (
      attr_stmt
    | edge_stmt
    | subgraph
    | graph_attr
    | node_stmt
)
stmt_list = many(stmt + skip(maybe(op(';'))))
subgraph.define(memoize(
    skip(n('subgraph')) +
    maybe(id) +
    op_('{') +
    stmt_list +
    op_('}')
    >> unarg(SubGraph)))
graph = (
    maybe(n('strict')) +
    maybe(n('graph') | n('digraph')) +
    maybe(id) +
    op_('{') +
    stmt_list +
    op_('}')
    >> unarg(Graph))
dotfile = graph + skip(eof)

name_parser_vars(locals())

def parse(seq):
    'Sequence(Token) -> object'
    return dotfile.parse(seq)

def pretty_parse_tree(x):
    'object -> str'
    Pair = namedtuple('Pair', 'first second')
    p = lambda x, y: Pair(x, y)
    def kids(x):
        'object -> list(object)'
        if isinstance(x, (Graph, SubGraph)):
            return [p('stmts', x.stmts)]
        elif isinstance(x, (Node, DefAttrs)):
            return [p('attrs', x.attrs)]
        elif isinstance(x, Edge):
            return [p('nodes', x.nodes), p('attrs', x.attrs)]
        elif isinstance (x, Pair):
            return x.second
        else:
            return []
    def show(x):
        'object -> str'
        if isinstance(x, Pair):
            return x.first
        elif isinstance(x, Graph):
            return 'Graph [id=%s, strict=%r, type=%s]' % (
                x.id, x.strict is not None, x.type)
        elif isinstance(x, SubGraph):
            return 'SubGraph [id=%s]' % x.id
        elif isinstance(x, Edge):
            return 'Edge'
        elif isinstance(x, Attr):
            return 'Attr [name=%s, value=%s]' % (x.name, x.value)
        elif isinstance(x, DefAttrs):
            return 'DefAttrs [object=%s]' % x.object
        elif isinstance(x, Node):
            return 'Node [id=%s]' % x.id
        else:
            return unicode(x)
    return pretty_tree(x, kids, show)

def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)
    #import funcparserlib
    #funcparserlib.parser.debug = True
    try:
        stdin = os.fdopen(sys.stdin.fileno(), 'rb')
        input = stdin.read().decode(ENCODING)
        tree = parse(tokenize(input))
        #print pformat(tree)
        print pretty_parse_tree(tree).encode(ENCODING)
    except SyntaxError, e:
        msg = (u'syntax error: %s' % e).encode(ENCODING)
        print >> sys.stderr, msg
        sys.exit(1)

if __name__ == '__main__':
    main()

